Categorical Feature Encoding Challenge: Kaggle Link
In this project the data file has lots of categorical features. We are asked to predict a binary target based on the other features using various feature-encoding techniques.
# id features
'id',
# binary features
'bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4',
# nominal features
'nom_0', 'nom_1','nom_2', 'nom_3', 'nom_4',
'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9',
# ordinal features
'ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5',
# cyclical features
'day', 'month',
# binary target
'target'
Continuous Data
Categorical Data
Timeseries data: these can be continuous or discrete. They have a time component associated with them. When using a train-test split we should not shuffle them.
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (12,8)})
plt.style.use('ggplot') # ggplot style applied on top of the seaborn settings above
matplotlib.rcParams['figure.figsize'] = 12,8
import os
import time
# random state for reproducibility of splits and models
SEED=100
np.random.seed(SEED)
# Jupyter notebook settings for pandas
#pd.set_option('display.float_format', '{:,.2g}'.format) # numbers sep by comma
pd.options.display.float_format = '{:,}'.format # df.A.value_counts().astype(float)
from pandas.api.types import CategoricalDtype
np.set_printoptions(precision=3)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 200)
import IPython
from IPython.display import display, HTML, Image, Markdown
# show the versions of the core libraries used in this notebook
print([(x.__name__,x.__version__) for x in [np, pd,sns,matplotlib]])
%%capture
ENV_BHISHAN = None
try:
import bhishan
ENV_BHISHAN = True
print("Environment: Bhishan's Laptop")
except:
pass
import sys
# True when the notebook is running inside Google Colab
# (Colab preloads the google.colab module into sys.modules).
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    # load google drive
    # from google.colab import drive
    # drive.mount('/content/drive')
    # dat_dir = 'drive/My Drive/Colab Notebooks/data/'
    # sys.path.append(dat_dir)
    # pip install
    #!pip install pyldavis
    # !pip install hyperopt
    !pip install catboost
    !pip install shap
    #!pip install eli5
    #!pip install lime
    # !pip install category_encoders # TargetEncoder
    # !pip install optuna # hyper param opt
    # print
    print('Environment: Google Colaboratory.')
if ENV_COLAB:
    # update modules
    !pip install -U scikit-learn
    !pip install -U tqdm # tqdm needs restart run time.
    pass
# encoders
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import FeatureHasher
# folding
from sklearn.model_selection import KFold
# pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import scipy
# evaluation (note: roc_auc_score is aliased to `auc` below)
from sklearn.metrics import roc_auc_score as auc
# boosting
import sklearn
import xgboost, lightgbm, catboost
from catboost import Pool, CatBoostClassifier
# extra modules
# import category_encoders
# from category_encoders import TargetEncoder
# show the versions of the modelling libraries used
print([(x.__name__,x.__version__) for x in [xgboost,lightgbm, catboost,
                                            sklearn,scipy]])
# Raw training data for the Kaggle Categorical Feature Encoding Challenge.
ifile = 'https://github.com/bhishanpdl/Project_Categorical_Feature_Encoding/blob/master/data/raw/train.csv?raw=true'
df = pd.read_csv(ifile)
df = df.astype(str) # make all string for catboost
print(df.shape)
df.head()
df.dtypes
# cardinality (number of unique values) of every feature, skipping id and target
[ (c,df[c].nunique()) for c in df.iloc[:,1:-1] ]
# df_train.info()
# df_test.info()
# as given in data description, the data set does not have any nulls.
target = 'target'
# class balance of the binary target
sns.countplot(df[target])
def show_method_attributes(obj, ncols=7, start=None, inside=None):
    """Show the public attributes of an object as a DataFrame.

    Parameters
    ----------
    obj : object
        Any object (class, module, instance) to inspect with dir().
    ncols : int
        Number of columns in the returned DataFrame.
    start : str, tuple, or list, optional
        Keep only attributes starting with this prefix (or any of them).
    inside : str, tuple, or list, optional
        Keep only attributes containing this substring (or any of them).

    Example:
    ========
    show_method_attributes(list)
    """
    print(f'Object Type: {type(obj)}\n')
    # public attributes only, minus common module aliases
    lst = [elem for elem in dir(obj) if not elem.startswith('_')]
    lst = [elem for elem in lst
           if elem not in 'os np pd sys time psycopg2'.split()]
    if isinstance(start, str):
        lst = [elem for elem in lst if elem.startswith(start)]
    elif isinstance(start, (tuple, list)):
        # any() fixes the old nested comprehension, which duplicated an
        # attribute whenever it matched more than one prefix.
        lst = [elem for elem in lst
               if any(elem.startswith(s) for s in start)]
    if isinstance(inside, str):
        lst = [elem for elem in lst if inside in elem]
    elif isinstance(inside, (tuple, list)):
        lst = [elem for elem in lst
               if any(sub in elem for sub in inside)]
    # pad into ncols roughly-equal columns; empty cells become ''
    return pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
# Empty results table; one row per trained model will be appended later.
_eval_cols = ['Model', 'Description', 'Accuracy',
              'Precision', 'Recall', 'F1', 'AUC']
df_eval = pd.DataFrame({col: [] for col in _eval_cols})
from sklearn.model_selection import train_test_split
# Hold out 20% as the final test set, stratified on the binary target so
# both splits keep the same class balance; SEED fixes the partition.
Xtrain_orig, Xtest, ytrain_orig, ytest = train_test_split(
    df.drop(target,axis=1),
    df[target],
    test_size=0.2,
    random_state=SEED,
    stratify=df[target])
# NOTE(review): splitting a DataFrame already yields DataFrames, so these
# re-wraps look like no-ops -- confirm before removing.
df_Xtrain_orig = pd.DataFrame(Xtrain_orig, columns=df.columns.drop(target))
df_Xtest = pd.DataFrame(Xtest, columns=df.columns.drop(target))
print(df_Xtrain_orig.shape)
df_Xtrain_orig.head()
# Split the 80% training portion again: 64% train / 16% validation of the
# full data, stratified on the training labels.
Xtrain, Xvalid, ytrain, yvalid = train_test_split(
    Xtrain_orig,
    ytrain_orig,
    test_size=0.2,
    random_state=SEED,
    stratify=ytrain_orig)
df_Xtrain = pd.DataFrame(Xtrain, columns=df.columns.drop(target))
df_Xvalid = pd.DataFrame(Xvalid, columns=df.columns.drop(target))
print(df_Xtrain.shape)
CatBoost parameters reference (note: this is the CatBoostRegressor signature; this notebook uses CatBoostClassifier): https://catboost.ai/docs/concepts/python-reference_catboostregressor.html
iterations=None,
learning_rate=None,
depth=None,
l2_leaf_reg=None,
model_size_reg=None,
rsm=None,
loss_function='RMSE',
border_count=None,
feature_border_type=None,
per_float_feature_quantization=None,
input_borders=None,
output_borders=None,
fold_permutation_block=None,
od_pval=None,
od_wait=None,
od_type=None,
nan_mode=None,
counter_calc_method=None,
leaf_estimation_iterations=None,
leaf_estimation_method=None,
thread_count=None,
random_seed=None,
use_best_model=None,
best_model_min_trees=None,
verbose=None,
silent=None,
logging_level=None,
metric_period=None,
ctr_leaf_count_limit=None,
store_all_simple_ctr=None,
max_ctr_complexity=None,
has_time=None,
allow_const_label=None,
one_hot_max_size=None,
random_strength=None,
name=None,
ignored_features=None,
train_dir=None,
custom_metric=None,
eval_metric=None,
bagging_temperature=None,
save_snapshot=None,
snapshot_file=None,
snapshot_interval=None,
fold_len_multiplier=None,
used_ram_limit=None,
gpu_ram_part=None,
pinned_memory_size=None,
allow_writing_files=None,
final_ctr_computation_mode=None,
approx_on_full_history=None,
boosting_type=None,
simple_ctr=None,
combinations_ctr=None,
per_feature_ctr=None,
ctr_target_border_count=None,
task_type=None,
device_config=None,
devices=None,
bootstrap_type=None,
subsample=None,
sampling_unit=None,
dev_score_calc_obj_block_size=None,
max_depth=None,
n_estimators=None,
num_boost_round=None,
num_trees=None,
colsample_bylevel=None,
random_state=None, # SEED = 100
reg_lambda=None,
objective=None,
eta=None,
max_bin=None,
gpu_cat_features_storage=None,
data_partition=None,
metadata=None,
early_stopping_rounds=None, # eg. 200
cat_features=None, # [0,1,2]
grow_policy=None,
min_data_in_leaf=None,
min_child_samples=None,
max_leaves=None,
num_leaves=None,
score_function=None,
leaf_estimation_backtracking=None,
ctr_history_unit=None,
monotone_constraints=None
)
# Treat every column as categorical (all values were cast to str above).
cat_features = list(range(Xtrain.shape[1]))
# time
time_start = time.time()
# current parameters
Xtr = Xtrain
Xtx = Xtest
Xvd = Xvalid
ytr, ytx, yvd = ytrain, ytest, yvalid
# fit the model. The original cell fitted `model_cat` but then called the
# undefined name `model`, and used the undefined cross_val_predict /
# StratifiedKFold / roc_auc_score names -- which would also have RE-FITTED
# the model on the test data. Evaluate the fitted model on the held-out
# test set instead.
model_cat = CatBoostClassifier(verbose=100,
                               random_state=SEED,
                               cat_features=cat_features)
model_cat.fit(Xtr, ytr,
              eval_set=(Xvd, yvd))
# predicted probability of the positive class, as ROC AUC expects scores
ypreds = model_cat.predict_proba(Xtx)[:, 1]
# ROC AUC on the held-out test set (`auc` is sklearn's roc_auc_score)
r = auc(ytx, ypreds)
# time
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
print('ROC AUC Score ', r)
from catboost import Pool
# Pool bundles data, labels and categorical-feature indices for catboost.
# Every column is categorical here (all values were cast to str earlier).
dtrain = Pool(
    data=Xtrain,
    label=ytrain,
    cat_features=list(range(Xtrain.shape[1]))
)
dvalid = Pool(
    data=Xvalid,
    label=yvalid,
    cat_features=list(range(Xtrain.shape[1]))
)
dtest = Pool(
    data=Xtest,
    label=ytest,
    cat_features=list(range(Xtrain.shape[1]))
)
# Track AUC and Accuracy alongside the optimized loss during training.
model = CatBoostClassifier(
    iterations=1000,  # moved here: `iterations` is a constructor argument,
                      # not a fit() argument -- the original call was a
                      # SyntaxError (keyword argument before positional)
    custom_loss=['AUC', 'Accuracy']
)
model.fit(
    dtrain,
    eval_set=dvalid,
    verbose=False,
    plot=True # does not work in gcolab
);
# Two identical models differing only in learning rate, each logging to its
# own train_dir so MetricVisualizer can compare the learning curves.
# NOTE(review): 'learing_rate' is a typo for 'learning_rate', but it is only
# a directory name and is used consistently in both places below.
model1 = CatBoostClassifier(
    learning_rate=0.7,
    iterations=500,
    train_dir='learing_rate_0.7'
)
model2 = CatBoostClassifier(
    learning_rate=0.01,
    iterations=500,
    train_dir='learing_rate_0.01'
)
model1.fit(dtrain, eval_set=dvalid, verbose=20)
model2.fit(dtrain, eval_set=dvalid, verbose=20);
from catboost import MetricVisualizer
MetricVisualizer(['learing_rate_0.7', 'learing_rate_0.01']).start()
# use_best_model=True shrinks the final model to the iteration with the
# best eval_set score (requires an eval_set).
model = CatBoostClassifier(
    iterations=100,
    use_best_model=True
)
model.fit(
    dtrain,
    eval_set=dvalid,
    verbose=False,
    plot=True
);
print('Tree count: ' + str(model.tree_count_))
from catboost import cv
params = {
    'loss_function': 'Logloss',
    'iterations': 80,
    'custom_loss': 'AUC',
    'learning_rate': 0.5,
}
# 5-fold cross-validation on the training pool
cv_data = cv(
    params = params,
    pool = dtrain,
    fold_count=5,
    shuffle=True,
    partition_random_seed=SEED,
    plot=True,
    verbose=False
)
cv_data.head(10)
# best iteration = the one with the lowest mean validation Logloss
best_value = cv_data['test-Logloss-mean'].min()
best_iter = cv_data['test-Logloss-mean'].values.argmin()
print('Best validation Logloss score, stratified: {:.4f}±{:.4f} on step {}'.format(
    best_value,
    cv_data['test-Logloss-std'][best_iter],
    best_iter)
)
from sklearn.model_selection import GridSearchCV
# 3-fold grid search over the learning rate only.
param_grid = {
    "learning_rate": [0.001, 0.01, 0.5],
}
clf = CatBoostClassifier(
    iterations=200,
    cat_features=cat_features,
    verbose=100
)
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=3)
results = grid_search.fit(Xtrain, ytrain)
results.best_estimator_.get_params()
# Early stopping: stop training once the eval_set metric has not improved
# for 20 consecutive iterations.
model_with_early_stop = CatBoostClassifier(
    iterations=200,
    learning_rate=0.5,
    early_stopping_rounds=20
)
model_with_early_stop.fit(
    dtrain,
    eval_set=dvalid,
    verbose=False,
    plot=True
);
print(model_with_early_stop.tree_count_)
# Same as above, but early stopping now monitors AUC instead of the
# default Logloss.
model_with_early_stop = CatBoostClassifier(
    eval_metric='AUC',
    iterations=200,
    learning_rate=0.5,
    early_stopping_rounds=20
)
model_with_early_stop.fit(
    dtrain,
    eval_set=dvalid,
    verbose=False,
    plot=True
);
print(model_with_early_stop.tree_count_)
# Prediction APIs: class labels, probabilities, and raw scores.
model = CatBoostClassifier(iterations=200, learning_rate=0.03)
model.fit(dtrain, verbose=50);
print(model.predict(dvalid)) # gives 0 and 1 with threshold 0.5
print(model.predict_proba(dvalid)) # actual probs
# raw scores (log-odds) before the sigmoid link
raw_pred = model.predict(
    dvalid,
    prediction_type='RawFormulaVal'
)
print(raw_pred)
from numpy import exp

def sigmoid(x):
    """Map raw log-odds to probabilities.

    Replaces the original `sigmoid = lambda x: ...` assignment
    (PEP 8 E731 discourages assigning lambdas to names).
    """
    return 1 / (1 + exp(-x))

probabilities = sigmoid(raw_pred)
print(probabilities)
import matplotlib.pyplot as plt
from catboost.utils import get_roc_curve
from catboost.utils import get_fpr_curve
from catboost.utils import get_fnr_curve
# ROC curve on the validation pool, then re-expressed as false-positive and
# false-negative rates as functions of the decision threshold.
curve = get_roc_curve(model, dvalid)
(fpr, tpr, thresholds) = curve
(thresholds, fpr) = get_fpr_curve(curve=curve)
(thresholds, fnr) = get_fnr_curve(curve=curve)
def plot_fpr_fnr(thresholds, fpr=fpr, fnr=fnr):
    """Plot FPR and FNR versus decision threshold on a single figure.

    Parameters
    ----------
    thresholds : array-like
        Decision thresholds (x axis).
    fpr, fnr : array-like, optional
        False-positive / false-negative rates aligned with `thresholds`.
        They default to the module-level arrays computed above; the
        original version read them silently from globals while taking
        only `thresholds` as a parameter.
    """
    plt.figure(figsize=(16, 8))
    style = {'alpha': 0.5, 'lw': 2}
    plt.plot(thresholds, fpr, color='blue', label='FPR', **style)
    plt.plot(thresholds, fnr, color='green', label='FNR', **style)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    plt.grid(True)
    plt.xlabel('Threshold', fontsize=16)
    plt.ylabel('Error Rate', fontsize=16)
    plt.title('FPR-FNR curves', fontsize=20)
    plt.legend(loc="lower left", fontsize=16);
plot_fpr_fnr(thresholds)
from catboost.utils import select_threshold
# pick decision thresholds that cap FNR (resp. FPR) at 1%
print(select_threshold(model, dvalid, FNR=0.01))
print(select_threshold(model, dvalid, FPR=0.01))
# per-iteration metric values on the validation pool
metrics = model.eval_metrics(
    data=dvalid,
    metrics=['Logloss','AUC'],
    ntree_start=0,
    ntree_end=0,
    eval_period=1,
    plot=True
)
print('AUC values:\n{}'.format(np.array(metrics['AUC'])))
The default feature importance for binary classification is PredictionValueChange — how much, on average, the model prediction changes when the feature value changes. These feature importances are non-negative. They are normalized and sum to 1, so you can read these values as percentages of importance.
# PredictionValueChange importances (the default): normalized, non-negative
np.array(model.get_feature_importance(prettified=True))
The non-default feature importance (LossFunctionChange) approximates how much the optimized loss function will change if the value of the feature changes. These importances might be negative if the feature has a bad influence on the loss function. They are not normalized; the absolute value of an importance has the same scale as the optimized loss value. To calculate this importance value you need to pass train_pool as an argument.
# LossFunctionChange importances require the training pool, since the loss
# has to be re-evaluated; these values may be negative and are unnormalized.
np.array(model.get_feature_importance(
    dtrain,
    'LossFunctionChange',
    prettified=True
))
# Two sample validation rows to explain individually below.
print(model.predict_proba([Xvalid.iloc[1,:]]))
print(model.predict_proba([Xvalid.iloc[91,:]]))
# SHAP values: one row per sample, one column per feature, plus a final
# column holding the expected (base) value.
shap_values = model.get_feature_importance(
    dvalid,
    'ShapValues'
)
expected_value = shap_values[0,-1]
shap_values = shap_values[:,:-1]
print(shap_values.shape)
# sample 1: probabilities vs raw score passed through the sigmoid
proba = model.predict_proba([Xvalid.iloc[1,:]])[0]
raw = model.predict([Xvalid.iloc[1,:]], prediction_type='RawFormulaVal')[0]
print('Probabilities', proba)
print('Raw formula value %.4f' % raw)
print('Probability from raw value %.4f' % sigmoid(raw))
import shap
shap.initjs()
shap.force_plot(expected_value, shap_values[1,:], Xvalid.iloc[1,:])
# sample 91: same comparison
proba = model.predict_proba([Xvalid.iloc[91,:]])[0]
raw = model.predict([Xvalid.iloc[91,:]], prediction_type='RawFormulaVal')[0]
print('Probabilities', proba)
print('Raw formula value %.4f' % raw)
print('Probability from raw value %.4f' % sigmoid(raw))
import shap
shap.initjs()
shap.force_plot(expected_value, shap_values[91,:], Xvalid.iloc[91,:])
# global view: SHAP values over the whole validation set
shap.summary_plot(shap_values, Xvalid)
#!rm 'catboost_info/snapshot.bkp'
# Snapshotting: training state is written every second so an interrupted
# run can resume from the snapshot file.
model = CatBoostClassifier(
    iterations=100,
    save_snapshot=True,
    snapshot_file='snapshot.bkp',
    snapshot_interval=1
)
model.fit(dtrain, eval_set=dvalid, verbose=10);
# Save / reload a small model in binary and JSON formats.
model = CatBoostClassifier(iterations=10)
model.fit(dtrain, eval_set=dvalid, verbose=False)
model.save_model('catboost_model.bin')
model.save_model('catboost_model.json', format='json')
model.load_model('catboost_model.bin')
print(model.get_params())
print(model.learning_rate_)
# Hand-tuned model (variable name 'tunned_model' is a typo but kept so any
# downstream cells that reference it keep working).
tunned_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.03,
    depth=6,
    l2_leaf_reg=3,
    random_strength=1,
    bagging_temperature=1
)
tunned_model.fit(
    Xtrain, ytrain,
    cat_features=cat_features,
    verbose=False,
    eval_set=(Xvalid, yvalid),
    plot=True
);
# Settings that trade some accuracy for much faster training.
fast_model = CatBoostClassifier(
    boosting_type='Plain',
    rsm=0.5,
    one_hot_max_size=50,
    leaf_estimation_iterations=1,
    max_ctr_complexity=1,
    iterations=100,
    learning_rate=0.3,
    bootstrap_type='Bernoulli',
    subsample=0.5
)
fast_model.fit(
    Xtrain, ytrain,
    cat_features=cat_features,
    verbose=False,
    eval_set=(Xvalid, yvalid),
    plot=True
);
# Settings that shrink the size of the final saved model.
small_model = CatBoostClassifier(
    learning_rate=0.03,
    iterations=500,
    model_size_reg=50,
    max_ctr_complexity=1,
    ctr_leaf_count_limit=100
)
small_model.fit(
    Xtrain, ytrain,
    cat_features=cat_features,
    verbose=False,
    eval_set=(Xvalid, yvalid),
    plot=True
);